I.Setup

Libraries needed: [Note: I wrote results hide but the ## are not results maybe??]

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
library(patchwork)

Loading data & Checking out how the loaded data looks like:

mouse.data <- Read10X(data.dir = "/Users/xiaoyizheng/Downloads/Praktikum/Bioinfo_Einarbeiten/CellChatSelf/mouse/GSE113854_RAW/")

mouse.data
##   [[ suppressing 32 column names 'AAACCTGAGAATTCCC-1', 'AAACCTGAGACCGGAT-1', 'AAACCTGAGACGCACA-1' ... ]]
##   [[ suppressing 32 column names 'AAACCTGAGAATTCCC-1', 'AAACCTGAGACCGGAT-1', 'AAACCTGAGACGCACA-1' ... ]]
##   [[ suppressing 32 column names 'AAACCTGAGAATTCCC-1', 'AAACCTGAGACCGGAT-1', 'AAACCTGAGACGCACA-1' ... ]]

Creating Seurat object:

mouse <- CreateSeuratObject(counts = mouse.data, project = "MoSk", min.cells = 3, min.features = 200)

II. Pre-processing

QC (quality control) Matrix for selection and filtration of cells, using user-defined criteria. In this case it is the mitochondrial genes as criteria. [note: apparently they all have same level of mt content. Maybe pre-processed already? or maybe that is not the proper criteria to choose in this case. Which one could be better instead of it?]

mouse[["percent.mt"]] <- PercentageFeatureSet(mouse, pattern = "^MT-") 

#Visualization by violin plot (mt percent no significance):
VlnPlot(mouse, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) 
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.mt.

Genes and total molecules calculated and stored in obj. meta data. Showing QC metrics for the first 5 cells:

head(mouse@meta.data, 5) 
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAATTCCC-1       MoSk       1475          783          0
## AAACCTGAGACCGGAT-1       MoSk       4148         1624          0
## AAACCTGAGACGCACA-1       MoSk       3136         1342          0
## AAACCTGAGATGTGGC-1       MoSk       3138         1323          0
## AAACCTGAGCTGCGAA-1       MoSk       4029         1721          0
#examining a few genes in the first thirty cells
mouse.data[c("Igf2", "Cck", "Pf4"), 1:30] 
## 3 x 30 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 30 column names 'AAACCTGAGAATTCCC-1', 'AAACCTGAGACCGGAT-1', 'AAACCTGAGACGCACA-1' ... ]]
##                                                                   
## Igf2 . . . . . 1 . .  . . . . . .  . . . 1 . . . . . . . . . . . .
## Cck  . . . . . . . .  . . . . . .  . . . . . . . . . . . . . . . .
## Pf4  . . . . . . . . 11 . . . . . 25 . . 1 1 . 1 1 . 1 . . . . . 2
#a few genes in the first 30 cells 
#picked "Igf2", "Cck", "Pf4" according to web

Visualization of feature-feature relationships

plot1 <- FeatureScatter(mouse, feature1 = "nCount_RNA", feature2 = "percent.mt") 
## Warning in cor(x = data[, 1], y = data[, 2]): the standard deviation is zero
#since all have same mt quality...

plot2 <- FeatureScatter(mouse, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
#positively correlated

plot1 + plot2

III. Normalization

since it is already normalized => this step omitted

IV. Feature Selection

mouse <- FindVariableFeatures(mouse, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(mouse), 10) # Identify the 10 most highly variable genes
top10
##  [1] "Hbb-bs" "Hba-a1" "Ccl21a" "Hbb-bt" "Saa3"   "Acp5"   "Ccl5"   "S100a8"
##  [9] "Mmp9"   "S100a9"
plot1 <- VariableFeaturePlot(mouse)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2

V. Scaling

all.genes <- rownames(mouse)
mouse <- ScaleData(mouse, features = all.genes)
## Centering and scaling data matrix
#optional: mouse <- ScaleData(mouse)

VI. Linear Dimension Reduction

Perform PCA

mouse <- RunPCA(mouse, features = VariableFeatures(object = mouse))
## PC_ 1 
## Positive:  Sparc, Bgn, Col1a2, Col1a1, Col3a1, Col6a1, Lgals1, Serpinh1, Col5a2, Fstl1 
##     Tmsb10, Col6a2, Col5a1, Col12a1, Aebp1, Dcn, Postn, Meg3, Fn1, Col6a3 
##     Mmp2, Fbln2, Lum, S100a6, Fosb, Loxl1, Thbs2, Serpinf1, Lrrc15, Tuba1a 
## Negative:  Tyrobp, Fcer1g, Lyz2, C1qb, Apoe, Ctss, C1qa, C1qc, Ms4a7, Aif1 
##     Cd52, Pf4, Wfdc17, Laptm5, Fth1, Cd74, Ms4a6c, Lgals3, Clec4n, Cd68 
##     Lst1, Alox5ap, H2-Aa, Ptpn18, Ucp2, Id2, AF251705, Trem2, Fcgr3, Clec4a2 
## PC_ 2 
## Positive:  Col1a1, Col1a2, Col3a1, Mfap4, Lum, Postn, Col5a2, Col12a1, Dcn, Thbs2 
##     Igf1, Aebp1, Mdk, Mmp2, Itm2a, Col5a1, Spon1, Lox, Ptn, Olfml3 
##     Dpt, Cthrc1, Fndc1, Aspn, Serpinf1, Lrrc15, Col6a2, Mfap5, Col6a1, Prss35 
## Negative:  Cdh5, Col4a1, Col4a2, Pecam1, Egfl7, Igfbp7, Cd93, Crip2, Cav1, Ramp2 
##     Plvap, Vim, Col15a1, Myct1, Kdr, Adgrf5, Esam, Pdlim1, Emcn, Ctla2a 
##     Ecscr, Sparcl1, Col18a1, Eng, Adgrl4, Mcam, Itga6, Gimap6, Ehd4, F11r 
## PC_ 3 
## Positive:  Sparcl1, Rgs5, Il6, Tinagl1, Igfbp7, Col4a1, Col4a2, Gm13889, Fabp4, Ebf1 
##     Ly6c1, Sept4, Emcn, Adgrf5, Ndufa4l2, Pecam1, Esam, Adgrl4, Flt1, Notch3 
##     Adamts9, Epas1, Des, Bcr, Vtn, Mcam, 2200002D01Rik, Higd1b, Apold1, Cygb 
## Negative:  Ftl1, Tyrobp, Fth1, Fcer1g, Lyz2, Cyba, Lgals3, Apoe, C1qb, Ctss 
##     C1qc, C1qa, Cstb, Laptm5, Tmsb4x, Lst1, Ctsd, Pf4, Ms4a7, Cd68 
##     Lgmn, Gm10116, Clec4n, Aif1, Wfdc17, Csf1r, Id2, Alox5ap, Ms4a6c, Ucp2 
## PC_ 4 
## Positive:  Rgs5, Ndufa4l2, Gm13889, Crip1, Acta2, Higd1b, Des, Notch3, Cox4i2, Thy1 
##     Mylk, Pdgfa, S100a4, Serpine2, Ppp1r14a, Il6, Myl9, Rasgrp2, Ebf1, Tuba1b 
##     2810417H13Rik, Cygb, Stmn1, Mgp, Ube2c, Mustn1, Cks2, Col4a1, Tpm1, H2afz 
## Negative:  Mfap4, Cdh5, Pecam1, Igf1, Col1a2, Cd34, Cd93, Ptn, Col3a1, Col1a1 
##     Ramp2, S100a16, Egfl7, Mest, Lum, Myct1, Dpt, Thbs2, Mdk, Itm2a 
##     Aebp1, Col14a1, Kdr, Ly6c1, Ogn, Ctla2a, C1qtnf3, Adgrl4, Emcn, Plvap 
## PC_ 5 
## Positive:  Birc5, Stmn1, H2afz, Ube2c, Top2a, Hmgb2, Tuba1b, 2810417H13Rik, Cenpa, Cks2 
##     Cdca3, 2700094K13Rik, Cks1b, Ccnb2, Tubb5, Hmgb1, Cdk1, H2afx, Smc2, Lockd 
##     Ccnb1, Cdca8, Ccna2, Prc1, Ccdc34, H2afv, Ube2s, Hmgn2, Nusap1, Spc24 
## Negative:  Rgs5, Cxcl1, Gm13889, Il6, Meg3, Serping1, Errfi1, Serpine2, Mylk, Col4a1 
##     Notch3, Ndufa4l2, Ccl2, Ebf1, Mt1, Col4a2, Des, Neat1, Phlda1, Cygb 
##     Dnajb1, Malat1, Apold1, Col18a1, Tnfaip2, Adamts4, Higd1b, Nr4a1, Sparcl1, Bcr

Examine and visualize PCA results a few different ways

print(mouse[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  Sparc, Bgn, Col1a2, Col1a1, Col3a1 
## Negative:  Tyrobp, Fcer1g, Lyz2, C1qb, Apoe 
## PC_ 2 
## Positive:  Col1a1, Col1a2, Col3a1, Mfap4, Lum 
## Negative:  Cdh5, Col4a1, Col4a2, Pecam1, Egfl7 
## PC_ 3 
## Positive:  Sparcl1, Rgs5, Il6, Tinagl1, Igfbp7 
## Negative:  Ftl1, Tyrobp, Fth1, Fcer1g, Lyz2 
## PC_ 4 
## Positive:  Rgs5, Ndufa4l2, Gm13889, Crip1, Acta2 
## Negative:  Mfap4, Cdh5, Pecam1, Igf1, Col1a2 
## PC_ 5 
## Positive:  Birc5, Stmn1, H2afz, Ube2c, Top2a 
## Negative:  Rgs5, Cxcl1, Gm13889, Il6, Meg3
VizDimLoadings(mouse, dims = 1:2, reduction = "pca")

DimPlot(mouse, reduction = "pca")

heatmap: exploration of the primary sources of heterogeneity -> decide which PCs to include for further downstream analyses

DimHeatmap(mouse, dims = 1, cells = 500, balanced = TRUE) #heatmap: exploration of the primary sources of heterogeneity

DimHeatmap(mouse, dims = 1:15, cells = 500, balanced = TRUE)

# VII. Determination of Dimension Metafeature combines information across a correlated feature set. top principal components => robust compression how many PCs to take? ###JackStraw procedure permute subset of the data (1% by default) and rerun PCA significant PCs ~ strong enrichment of low p-value features

mouse <- JackStraw(mouse, num.replicate = 100)
mouse <- ScoreJackStraw(mouse, dims = 1:20)
JackStrawPlot(mouse, dims = 1:15)
## Warning: Removed 21000 rows containing missing values (`geom_point()`).

alternative heuristic method Elbow plot: observe where the elbow is and then see which PCs to choose

ElbowPlot(mouse)

# VIII. Clustering Cells

mouse <- FindNeighbors(mouse, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
mouse <- FindClusters(mouse, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22322
## Number of edges: 704154
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9039
## Number of communities: 15
## Elapsed time: 3 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(mouse), 5)
## AAACCTGAGAATTCCC-1 AAACCTGAGACCGGAT-1 AAACCTGAGACGCACA-1 AAACCTGAGATGTGGC-1 
##                  3                  1                 13                  1 
## AAACCTGAGCTGCGAA-1 
##                  7 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14

IX. Non-linear Dimension Reduction

e.g. UMAP, tSNE

input to the UMAP and tSNE: same PCs as input to the clustering analysis

mouse <- RunUMAP(mouse, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:24:37 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:24:37 Read 22322 rows and found 10 numeric columns
## 17:24:37 Using Annoy for neighbor search, n_neighbors = 30
## 17:24:37 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:24:39 Writing NN index file to temp file /var/folders/w_/82bj411144v93vqxt1tbsg9h0000gn/T//Rtmp5dFdA6/file9cc3fecef22
## 17:24:39 Searching Annoy index using 1 thread, search_k = 3000
## 17:24:46 Annoy recall = 100%
## 17:24:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:24:47 Initializing from normalized Laplacian + noise (using irlba)
## 17:24:48 Commencing optimization for 200 epochs, with 905958 positive edges
## 17:25:00 Optimization finished
#note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
DimPlot(mouse, reduction = "umap")

#saveRDS(mouse, file = "/Users/xiaoyizheng/Downloads/Bioinfo_Einarbeiten/CellChatSelf/mouse/output/mouseOutput.rds")

X. Cluster Biomarkers

Finding differentially expressed features

  • min.pct argument: requires a feature to be detected at a minimum percentage in either of the two groups of cells
  • thresh.test argument: requires a feature to be differentially expressed (on average) by some amount between the two groups
  • to speed up computations: max.cells.per.ident
# find all markers of cluster 2
cluster2.markers <- FindMarkers(mouse, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
##         p_val avg_log2FC pct.1 pct.2 p_val_adj
## Col3a1      0  0.9038735 1.000 0.962         0
## Fmod        0  0.4638764 0.308 0.068         0
## Angptl1     0  0.4226594 0.327 0.075         0
## Dpt         0  0.7186185 0.715 0.330         0
## Mdk         0  0.8305024 0.836 0.405         0
cluster5.markers <- FindMarkers(mouse, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
##        p_val avg_log2FC pct.1 pct.2 p_val_adj
## Col3a1     0 -1.9613086 0.873 0.997         0
## Col5a2     0 -1.1786174 0.381 0.858         0
## Col6a3     0 -1.1001785 0.400 0.882         0
## Selp       0  0.9992272 0.312 0.008         0
## F11r       0  0.5869482 0.373 0.011         0
mouse.markers <- FindAllMarkers(mouse, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)  #No features pass logfc.threshold threshold, maybe threshold=0.1, 0.15, 0.2?
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
mouse.markers <- FindAllMarkers(mouse, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.2) #0.2 is okki
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
mouse.markers %>%
  group_by(cluster) %>%
  slice_max(n = 2, order_by = avg_log2FC)
## # A tibble: 29 × 7
## # Groups:   cluster [15]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene  
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr> 
##  1 1.59e-116      0.256 0.764 0.531 2.72e-112 0       Crabp1
##  2 1.26e- 99      1.42  0.338 0.186 2.15e- 95 1       Saa3  
##  3 0              1.04  0.909 0.513 0         1       Crabp1
##  4 0              1.58  0.91  0.427 0         2       Ptn   
##  5 0              1.41  0.794 0.328 0         2       H19   
##  6 0              2.13  0.958 0.234 0         3       Rgs5  
##  7 0              1.35  0.422 0.123 0         3       Mgp   
##  8 0              1.06  0.739 0.359 0         4       Il1b  
##  9 6.01e-232      1.01  0.61  0.284 1.03e-227 4       Cd74  
## 10 0              2.52  0.902 0.204 0         5       Ctla2a
## # … with 19 more rows
#cluster0.markers <- FindMarkers(mouse, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
cluster0.markers <- FindMarkers(mouse, ident.1 = 0, logfc.threshold = 0.2, test.use = "roc", only.pos = TRUE)
VlnPlot(mouse, features = c("Rgs5", "Tyrobp"))

VlnPlot(mouse, features = c("Col1a1", "Col1a2"), slot = "counts", log = TRUE) #plot raw counts

FeaturePlot(mouse, features = c("Sparc", "Tyrobp", "Igfbp7", "Pecam1", "Cd34", "Mfap4", "Ndufa4l2", "Il6",
                               "Birc5"))

mouse.markers %>%
  group_by(cluster) %>%
  top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(mouse, features = top10$gene) + NoLegend()

XI. Assigning Cell Type Identity to Clusters